rm(list=ls())
setwd("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis")
#library(GEOquery)
library(SummarizedExperiment)
library(SingleCellExperiment)
library(GEOquery)
library(ISLET)
library(Seurat)
library(MuSiC)
library(dplyr)
library(ggplot2)
library(tidyr)
library(fgsea)
library(Matrix)
library(pheatmap)
library(stringr)
library(gridExtra)
library(reshape2)
library(biomaRt)
library(linseed)
library(DWLS)
library(FARDEEP)
library(MCMCpack)
library(PROPER)
library(edgeR)
library(MASS)
library(grid)
library(pROC)
library(ROCR)
library(DESeq2)

1. Parameter Estimation

real_gene_count <- read.table("/Users/ziyiou/CUHKSZ/23 Fall/Genomics/longitudinal analysis/ISLET simulation/GSE60424_GEOSubmit_FC1to11_normalized_counts.txt", row.names = 1)

data <- getGEO("GSE60424")
gset <- data[[1]]
real_pdata <- pData(gset)
colnames(real_gene_count) <- real_pdata$geo_accession
# 指定 Ensembl 物种和版本
ensembl <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")

# 获取 Ensembl ID 和基因名称的映射
annotation <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), mart = ensembl)

# 使用映射将 Ensembl ID 转换为基因名称
gene_names <- annotation$external_gene_name[match(rownames(real_gene_count), annotation$ensembl_gene_id)]  ## 存在NA和""

# 查找 Ensembl ID 中不存在于数据库中的项
missing_ids <- setdiff(rownames(real_gene_count), annotation$ensembl_gene_id)

# 打印不存在的 Ensembl ID
cat("Number of ensemble ID not exists in biomaRt database:", length(missing_ids))


# 更新 real_gene_count 和 gene_names
real_gene_count <- real_gene_count[!is.na(gene_names) & gene_names != "", ]
gene_names <- gene_names[!is.na(gene_names) & gene_names != ""]

## 存在一个gene id对应多个ensemble id的情况,此处直接删除
duplicate_indices <- duplicated(gene_names)
duplicated_genes <- gene_names[duplicate_indices]
print(unique(duplicated_genes))

real_gene_count <- real_gene_count[!duplicate_indices, ]
gene_names <- gene_names[!duplicate_indices]
rownames(real_gene_count) <- gene_names

cat("Number of genes in bulk reference matrix after mapping ensemble id to gene names:", nrow(real_gene_count))
saveRDS(real_gene_count, file = "real_gene_count.rds")
saveRDS(real_pdata, file = "real_pdata.rds")
################ meta data ################
real_gene_count = readRDS("real_gene_count.rds")
real_pdata = readRDS("real_pdata.rds")

real_meta_data <- data.frame(group = ifelse(real_pdata$`diseasestatus:ch1` == "Healthy Control", "ctrl", 
                      "case"),
                      subject_ID = as.numeric(real_pdata$`donorid:ch1`),
                      age = as.numeric(real_pdata$`age:ch1`),
                      cellType = real_pdata$`celltype:ch1`)
rownames(real_meta_data) <- real_pdata$geo_accession


# 去除细胞类型为"Whole Blood"的样本
filtered_meta_data <- subset(real_meta_data, cellType != "Whole Blood")
filtered_gene_count <- real_gene_count[colnames(real_gene_count) %in% rownames(filtered_meta_data), ]

# 确保行名的顺序匹配
filtered_gene_count <- filtered_gene_count[, rownames(filtered_meta_data)]


ctrl_meta_data <- subset(filtered_meta_data, group == "ctrl")
case_meta_data <- subset(filtered_meta_data, group == "case")

ctrl_gene_count <- filtered_gene_count[, rownames(ctrl_meta_data)]
case_gene_count <- filtered_gene_count[, rownames(case_meta_data)]

ctrl_gene_count <- ctrl_gene_count
################ 

We use DESeq2 to do the parameter estimation.

We get the mean expression of gene and perform log translation to it, save it as mu_m.rds.

In the reference panel, we assume the mean expression of gene \(g\) for subject \(j\) at time follows \(N(\mu _m, \sigma_m^2)\), where \(\sigma_m\) is estimated by estimateDispersions function in DESeq2. We save it as var_m.rds.

dds <- DESeqDataSetFromMatrix(countData = ctrl_gene_count,
                              colData = ctrl_meta_data,
                              design = ~ 1)
dds
## class: DESeqDataSet 
## dim: 29920 24 
## metadata(1): version
## assays(1): counts
## rownames(29920): FGR CFH ... LINC00662 IGHV3OR16-15
## rowData names(0):
## colnames(24): GSM1479438 GSM1479439 ... GSM1479524 GSM1479525
## colData names(4): group subject_ID age cellType
dds1 <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 96 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds1)
res
## log2 fold change (MLE): Intercept 
## Wald test p-value: Intercept 
## DataFrame with 29920 rows and 6 columns
##               baseMean log2FoldChange     lfcSE      stat       pvalue
##              <numeric>      <numeric> <numeric> <numeric>    <numeric>
## FGR          857.34899        9.74374  0.443362  21.97692 4.78836e-107
## CFH            2.73618        1.45216  0.503580   2.88368  3.93055e-03
## FUCA2         17.96582        4.16718  0.312560  13.33243  1.49937e-40
## GCLC          20.26188        4.34070  0.189674  22.88500 6.55419e-116
## NFYA          82.97725        6.37464  0.164013  38.86666  0.00000e+00
## ...                ...            ...       ...       ...          ...
## CYP3A54P      0.000000             NA        NA        NA           NA
## LINC02164     0.000000             NA        NA        NA           NA
## TUBB8P7       0.127477      -2.971695  1.658891 -1.791375    0.0732332
## LINC00662     0.879693      -0.184927  0.347799 -0.531708    0.5949283
## IGHV3OR16-15  0.000000             NA        NA        NA           NA
##                      padj
##                 <numeric>
## FGR          1.38268e-106
## CFH           6.03569e-03
## FUCA2         3.13892e-40
## GCLC         1.96268e-115
## NFYA          0.00000e+00
## ...                   ...
## CYP3A54P               NA
## LINC02164              NA
## TUBB8P7         0.0841442
## LINC00662       0.6180206
## IGHV3OR16-15           NA
dds <- estimateSizeFactors(dds)
dds <- estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
plotDispEsts(dds)

mu_m <- data.frame(res$baseMean)
#mu_m <- log(mu_m)
rownames(mu_m) <- rownames(ctrl_gene_count)

dispersion_m <- data.frame(dispersions(dds))
rownames(dispersion_m) = rownames(dispersion_m)


# 标记 mu_m 中包含 -Inf 的行
non_inf_rows <- rowSums(mu_m == -Inf) == 0

# 标记 var_m 中包含 NA 的行
non_na_rows <- rowSums(is.na(dispersion_m)) == 0

# 取交集:同时满足 non_inf_rows 和 non_na_rows 的行
valid_rows <- non_inf_rows & non_na_rows
gene <- rownames(mu_m)[valid_rows]

# 筛选出 mu_m 和 var_m 中保留的行
mu_m_filtered <- mu_m[valid_rows, ]
dispersion_m_filtered <- dispersion_m[valid_rows, ]

mu_m <- data.frame(mu_m_filtered)
rownames(mu_m) = gene
mu_m <- head(mu_m, 5000)

dispersion_m <- data.frame(dispersion_m_filtered)
rownames(dispersion_m) = gene
dispersion_m <- head(dispersion_m, 5000)

var_m = 1/mu_m + dispersion_m

In DESeq2, variance \(\sigma^2 = \mu + \alpha\mu^2\), here \(\mu\) is mean and \(\alpha\) is the dispersion. Since in the following simulation we simulate count in log scale, and by Taylor Expansion we know that \(log(X) ≈ log(\mathbb{E}[X])+ \mathbb{E}[X] (X−\mathbb{E}[X])\), therefore \(\mathrm{Var}[log(X)] = \displaystyle \frac{\mathrm{Var}[X]}{\mathbb{E}^2[X]}\)

saveRDS(log(mu_m), file="mu_m.rds")
saveRDS(dispersion_m, file = "dispersion_m.rds")
saveRDS(var_m, file = "var_m.rds")
mu_m = readRDS("mu_m.rds")
mu_m = mu_m$mu_m_filtered
sigma_m = readRDS("var_m.rds")

cellType_list = c("B-cells", "CD4", "CD8", "NK", "Neutrophils", "Monocytes")
gene_list = rownames(sigma_m)

2. Temporal cell type proportions

We assume the proportion \(\theta_{jt}\) of each subject \(j\) for \(K\) cell types at time \(t\) follows Dirichlet distribution with parameter \(\alpha\), where \(\alpha\) and \(\theta_{jt}\) are both \(K \times 1\) vectors, \(\mathbf{1}^T\theta_{jt} = 1\).

################# Notation Setup #################

#### input ####
# J: number of subject (must be an even number)
# T: number of time point (must be an even number)
# alpha_ls: cell population composition parameters for Dirichlet dist.  (list(alpha_c, alpha_d))
# cell_ls: list of cell types

#### output ####
# theta: cell type proportion

#### This `Gen_prop` function is for generating cell type proportion

Gen_prop <- function(J, T, alpha_ls, cell_ls) {
  set.seed(123)
  alpha_c = alpha_ls[["alpha_c"]]  # Dirichilet distribution params for control group
  alpha_d = alpha_ls[["alpha_d"]]  # Dirichilet distribution params for case group
  
  ctrl_theta <- rdirichlet(n=J*T/2, alpha = alpha_c)
  case_theta <- rdirichlet(n=J*T/2, alpha = alpha_d)
  
  colnames(ctrl_theta) <- cell_ls
  colnames(case_theta) <- cell_ls
  rownames(case_theta) <- seq(from = 1, to = J * T / 2) # in accordance with sample_ID in metadata
  rownames(ctrl_theta) <- seq(from = J * T / 2 + 1, to = J * T)
  
  theta <- rbind(case_theta, ctrl_theta) # case is on the top
  
  return(theta = t(theta)) 
  # transpose: let theta be a n*K matrix (n is the number of samples)
}

In the paper, author provided us with the parameter value for the Dirichlet distribution:

\[\alpha_c = (8.85, 6.49, 5.98, 5.28, 4.22, 3.85)\]

\[\alpha_d = (1.90, 2.25, 2.10, 5.72, 7.33, 15.37)\]

ctrl_alpha <- c(8.85, 6.49, 5.98, 5.28, 4.22, 3.85) # ctrl group
case_alpha <- c(1.90, 2.25, 2.10, 5.72, 7.33, 15.37) # case group
alpha_list = list(alpha_c = ctrl_alpha, alpha_d = case_alpha)

theta = Gen_prop(J=50, T=5, alpha_ls = alpha_list, cell_ls = cellType_list)
theta[, 1:5]
##                      1          2          3          4          5
## B-cells     0.02230079 0.01501410 0.01314183 0.04691058 0.04692466
## CD4         0.06623229 0.08058458 0.06463024 0.12883512 0.12880281
## CD8         0.07401944 0.05478895 0.14144167 0.03732913 0.10845406
## NK          0.14745136 0.11669351 0.20039315 0.18620546 0.17347204
## Neutrophils 0.30466902 0.20367899 0.21962805 0.28593910 0.24653747
## Monocytes   0.38532711 0.52923987 0.36076506 0.31478060 0.29580895

3. Individual reference panel and temporal gene expression

For gene \(g\), the mean expression of cell type \(k\) per subject \(j\) at time \(t\) is \(M_{gjkt} \sim N(\mu^{t_j}_k, \sigma_g^2)\) (\(\sigma_g\) is the overdispersion of each gene estimated by DESeq2).

The true reference matrix is \(\lambda_{gjkt} \sim \Gamma(\mathrm{exp}(-\Phi_{gjk}), M_{gjkt}\mathrm{exp}(\Phi_{gjk}))\), where \(M_{gjkt}\) and \(\Phi_{gjk}\) are components in vector \(\mathbf{M_{gjt}}, \mathbf{\Phi_{gj}}\), and \(\mathrm{exp}(-\Phi_{gjk})\) is shape, \(M_{gjkt}\mathrm{exp}(\Phi_{gjk})\) is scale. (\(\Phi_{gjk} \sim N(-3, 1)\)).

Note that \(M_{gjkt}\) and \(\Phi_{gjk}\) are both in log scale.

(1) Main Function

control_matrix <- function(cell_ls, gene_ls, DEG_ls, mu_jt, delta_jt) {
  G = length(gene_ls) # number of genes
  K = length(cell_ls)
  
  
  control <- matrix(0, nrow = G, ncol = K)
  colnames(control) = cell_ls
  rownames(control) = gene_ls
  
  for (k in 1:K) {
    deg_indices = DEG_ls[[cell_ls[k]]]$Type3
    control[deg_indices, ] = delta_jt  # k*delta_jt
  }
  
  control = control + mu_jt
  
  return(control)
  
}

case_matrix <- function(cell_ls, gene_ls, DEG_ls, mu_jt, delta_jt) {
  G = length(gene_ls) # number of genes
  K = length(cell_ls)
  
  
  case <- matrix(0, nrow = G, ncol = K)
  colnames(case) = cell_ls
  rownames(case) = gene_ls
  
  for (k in 1:K) {
    deg_indices = DEG_ls[[cell_ls[k]]]$Type3
    case[deg_indices, ] = delta_jt # k*delta_jt
    case[deg_indices, k] = 0
  }
  
  case = case + mu_jt
  
  return(case)
  
}
################# Notation Setup #################

#### input ####
# J: number of subject (must be an even number)
# T: number of time point (must be an even number)
# mu_m
# sigma_m
# gene_ls: list of gene names
# cell_ls: list of cell types
# Delta_0 (LFC): age-independent group effect (intercept)
# k_mu: slope in baseline group
# k_delta: age-dependent group effect (slope)

#### output ####
# M: mean expression of gene
# Phi: overdispersion
# DEG_ls: list of differentially expressed genes (list(Type1, Type2, Type3))
# lambda: true reference expression
# meta_data

#### This `Gen_ref` function is for generating longitudinal individual reference panel

Gen_ref <- function(J, T, mu_m, sigma_m,
                       gene_ls, cell_ls, LFC=1.5, k_mu = 0.01, k_delta = 0.1) {
  set.seed(123) 
  
  ### meta data
  meta_data <- data.frame(sample_ID = 1:(J*T),
                          group = rep(c("case", "ctrl"), each = T*J/2), 
                          subject_ID = rep(1:J, each=T), 
                          time_point = rep(1:T, J))
  
  
  
  ### marker gene index generation
  G = length(gene_ls) # number of genes
  K = length(cell_ls)
  
  sample_size <- ceiling(0.05 * G) # 5% DEG of each cell type
  DEG_ls <- setNames(vector("list", length(cell_ls)), cell_ls)
  
  for (k in 1:K) {
    
    DEG_ls[[cell_ls[k]]] = list(
      Type3 = seq(((k-1)*sample_size)+1, (k*sample_size))) }
  

  ### reference panel generation
  mu_m = matrix(rep(mu_m, times = K), nrow = G, ncol = K)
  rownames(mu_m) = gene_ls
  colnames(mu_m) = cell_ls
  
  sigma_m = data.frame(rep(sigma_m, each=K))
  rownames(sigma_m) = gene_ls
  colnames(sigma_m) = cell_ls
  
  ### Generate M
  M_ls = list()
  shift = ceiling(T/2) 
  
  for  (j in (1:(J %/% 2))){ 
    mu_jt = mu_m
    delta_jt = LFC
    for (t in (1:T)) { # linear 
      #if (t %in% (1:shift)) {mu_m_case = mu_m_case + Delta3}
      #else {mu_m_case = mu_m_case - Delta3}
      mu_jt = mu_jt + k_mu*t
      delta_jt = delta_jt + k_delta*t
      mu_m_case = case_matrix(cell_ls = cell_ls, gene_ls = gene_ls, DEG_ls = DEG_ls, mu_jt = mu_jt, delta_jt = delta_jt)
      M <- mapply(rnorm, n = 1, mean = mu_m_case, sd = sigma_m)
      M <- matrix(M, nrow = nrow(mu_m_case), ncol = ncol(mu_m_case), 
             dimnames = dimnames(mu_m_case))
    
    # current idx: (j-1)*T + t
      M_ls[[as.character((j - 1) * T + t)]] <- M

   }
  }
  
  for  (j in (((J %/% 2)+1):J)){ 
    mu_jt = mu_m
    delta_jt = LFC
    for (t in (1:T)) { 
      mu_jt = mu_jt + k_mu*t
      delta_jt = delta_jt + k_delta*t
      mu_m_ctrl = control_matrix(cell_ls = cell_ls, gene_ls = gene_ls, DEG_ls = DEG_ls, mu_jt = mu_jt, delta_jt = delta_jt)
      M <- mapply(rnorm, n = 1, mean = mu_m_ctrl, sd = sigma_m)
      M <- matrix(M, nrow = G, ncol = K, 
             dimnames = dimnames(mu_m_ctrl))
    
    # current idx: (j-1)*T + t
      M_ls[[as.character((j - 1) * T + t)]] <- M

    }
  }
  
  ## Generate Phi
  Phi_ls = list()
  for (j in 1:J) {
    for (t in 1:T) {
    # 生成 Phi 矩阵
    Phi <- mapply(rnorm, n = G*K, mean = -3, sd = 1) 
    Phi <- matrix(Phi, nrow = G, ncol = K, 
             dimnames = dimnames(mu_m_ctrl))
    Phi_ls[[as.character((j - 1) * T + t)]] <- Phi }
    }
  
  ## Generate lambda
  lambda_ls = list()
  for  (j in (1:J)){ 
    for (t in (1:T)){
      idx = (j - 1) * T + t
      M = M_ls[[as.character(idx)]]
      Phi = Phi_ls[[as.character(idx)]]
      lambda <- matrix(nrow = nrow(M), ncol = ncol(M))
      
      shape_matrix <- exp(-Phi)
      rate_matrix <- 1 / (exp(M) * exp(Phi))
      
      lambda <- mapply(function(shape, rate) {
          if (shape <= 0 || rate <= 0) {
            return(0)} else {
    return(rgamma(n = 1, shape = shape, rate = rate))}}, 
    as.vector(shape_matrix), as.vector(rate_matrix))
      
      # reorganize lambda into a matrix
      lambda <- matrix(lambda, nrow = nrow(M), ncol = ncol(M))
      lambda[lambda > 20000] <- 20000
      colnames(lambda) = colnames(M)
      rownames(lambda) = rownames(M)
      
      lambda_ls[[as.character(idx)]] <- lambda
    }
  }
  
  
  return (list(DEG_ls = DEG_ls, M_ls = M_ls, Phi_ls = Phi_ls, lambda_ls=lambda_ls, meta_data = meta_data))
}

(2) Heatmap Function

create_heatmap_grob <- function(M, title, color_palette) {
  pheatmap_obj <- pheatmap(M,
                           cluster_rows = FALSE,
                           cluster_cols = FALSE,
                           show_rownames = FALSE,
                           show_colnames = TRUE,
                           color = colorRampPalette(c("blue", "white", "red"))(100),
                           main = title,
                           silent = TRUE)  # silent=TRUE 以避免直接打印
  
  grid_plot <- pheatmap_obj$gtable
  return(grid_plot)
}

4. Mixture-cell expression

The observed bulk RNA-seq raw counts for gene g in subject j measured at time point t is generated from Poisson distribution \(Y_{gjt} \sim Pois(\tilde{\lambda}_{gjt})\), where \(\tilde{\lambda}_{gjt} = \lambda_{gtj}^T\theta_{gjt}\), \(\lambda_{gtj}^T = (\lambda_{gjt1}, ..., \lambda_{gjtK})\)

################# Notation Setup #################

#### input ####
# lambda_ls: cell-type-specific reference
# theta: cell-type proportion

#### output ####
# Y: bulk RNA-seq raw counts

#### This `Gen_mix` function is for generating mixture-cell expression

Gen_mix <- function(lambda_ls, theta) {
  n = length(lambda_ls) # the number of samples (J*T)
  G = nrow(lambda_ls[[1]]) # the number of genes
  K = ncol(lambda_ls[[1]]) # the number of cell types
  
  Y = matrix(nrow = G, ncol = n)
  
  for (i in 1:n) {
    mean_value = lambda_ls[[i]] %*% theta[, i]
    Y[, i] = rpois(G, lambda = mean_value)
  }
  
  colnames(Y) = colnames(theta)
  rownames(Y) = rownames(lambda_ls[[1]])
  
  return (Y = Y)
}

5. \(\Delta_0\) = 0

ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
                       gene_ls = gene_list, cell_ls = cellType_list, LFC=0.0,  
              k_mu = 0.01, k_delta = 0.1)

meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls

head(meta_data)
##   sample_ID group subject_ID time_point
## 1         1  case          1          1
## 2         2  case          1          2
## 3         3  case          1          3
## 4         4  case          1          4
## 5         5  case          1          5
## 6         6  case          2          1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)

theta_df = as.data.frame(t(theta))

(1) Heatmap of \(M\) and \(\lambda\)

i. Mean expression \(M\)

heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
  M <- M_list[[i]]
  if (i %in% 1:5) {
  create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) } 
  else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})

for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 第i个热图
    heatmap_grobs[[i + 5]],    # 第i+5个热图
    nrow = 1                   # 一行两列
  )
}

We examine whether DE genes we set are differentially expressed in \(M\) by looking at the corresponding index in case and control.

heatmap_grobs <- lapply(c(1:5), function(i) {
  M_case <- M_list[[i]]
  M_ctrl <- M_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
 M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
    M_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
    M_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
 M_combined <- cbind(M_sub_case, M_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

ii. Ture reference panel \(\lambda\)

heatmap_grobs <- lapply(c(1:5), function(i) {
  lambda_case <- lambda_list[[i]]
  lambda_ctrl <- lambda_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
  lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
    lambda_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
    lambda_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
  lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

(2) Deconvolve individual-specific reference panel

We deconvolve the individual-specific reference panel with isletSolve() and compare the deconvolved reference panel with true reference panel.

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
N25_input

execution_time <- system.time({
  N25_age.ref <- isletSolve(input=N25_input)
})

print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_0_0.rds")
N25_age.ref <- readRDS("N25_ref_0_0.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)

lambda_list_islet <- list()

for (i in 1:50) {
  if (i %in% seq(1,25)) {
    Bcell = caseVal[["B-cells"]][,i]
    CD4 = caseVal[["CD4"]][,i]
    CD8 = caseVal[["CD8"]][,i]
    NK = caseVal[["NK"]][,i]
    Neutrophils = caseVal[["Neutrophils"]][,i]
    Monocytes = caseVal[["Monocytes"]][,i]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
    
  }
  
  else {
    Bcell = ctrlVal[["B-cells"]][,i-25]
    CD4 = ctrlVal[["CD4"]][,i-25]
    CD8 = ctrlVal[["CD8"]][,i-25]
    NK = ctrlVal[["NK"]][,i-25]
    Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
    Monocytes = ctrlVal[["Monocytes"]][,i-25]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
  }
}

true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Case)")  # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)")  # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.214532247931833"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), # +1 to avoid -Inf
     main="ISLET estimated reference panel vs the true reference panel (Case)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19, 
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.238309536979969"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Ctrl)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19,  
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

(3) Test cell-type-specific differential expression (csDE) genes

Slope test

Since \(\Delta_0 = 0\) in this setting, we only conduct slope test.

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1    2     3   4     5    6
## FGR   1642 4415 11255 617 13980 6415
## CFH      4  119    13  72     8    7
## FUCA2   58   20  2900  98  1914  143
##       126  127  128  129  130  131
## FGR   727 8200 7330 5902 8592 3099
## CFH     1   44    6   54  392    4
## FUCA2  34   81  378  117  158  182
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "slope"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_0_0.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_0.0 <- readRDS("N25_slope_test_0_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_0.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.0)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
  Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_0.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_0.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.072
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.068
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.068
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.128
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.096
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.148
accuracy_list_0.0_slope <- accuracy_list_0.0

We use ROC curve to illustrate the result.

K = length(cellType_list)
G = length(gene_list)
flag = matrix(0, nrow = G, ncol = K)
rownames(flag) = gene_list
colnames(flag) = cellType_list
for (k in 1:K) {
    type3_indices = DEG_list[[cellType_list[k]]]$Type3
    flag[type3_indices, k] = 1
  }
ROC.DE <- function(DE.gs, pval) {
    pred = prediction(1-pval, DE.gs)
    perf = performance(pred,"tpr","fpr")
    perf
}

roc = ROC.DE(flag, N25_age.test_0.0)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 0)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

6. \(\Delta_0\) = 0.5

ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
                       gene_ls = gene_list, cell_ls = cellType_list, LFC=0.5,  
              k_mu = 0.01, k_delta = 0.1)

meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls

head(meta_data)
##   sample_ID group subject_ID time_point
## 1         1  case          1          1
## 2         2  case          1          2
## 3         3  case          1          3
## 4         4  case          1          4
## 5         5  case          1          5
## 6         6  case          2          1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)

theta_df = as.data.frame(t(theta))

(1) Heatmap of \(M\) and \(\lambda\)

i. Mean expression \(M\)

heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
  M <- M_list[[i]]
  if (i %in% 1:5) {
  create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) } 
  else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})

for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 第i个热图
    heatmap_grobs[[i + 5]],    # 第i+5个热图
    nrow = 1                   # 一行两列
  )
}

We examine whether DE genes we set are differentially expressed in \(M\) by looking at the corresponding index in case and control.

heatmap_grobs <- lapply(c(1:5), function(i) {
  M_case <- M_list[[i]]
  M_ctrl <- M_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
 M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
    M_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
    M_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
 M_combined <- cbind(M_sub_case, M_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

ii. Ture reference panel \(\lambda\)

heatmap_grobs <- lapply(c(1:5), function(i) {
  lambda_case <- lambda_list[[i]]
  lambda_ctrl <- lambda_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
  lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
    lambda_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
    lambda_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
  lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

isletSolve.test <-function(input, BPPARAM=bpparam() ){
    # islet.solve only runs on the model without age effect.
    if(input@type == 'slope'){
        stop('Input should be prepared by dataPrep()')
    }

    #make Yall a list
    G <- nrow(input@exp_case)
    Yall<-as.matrix(cbind(input@exp_case, input@exp_ctrl))
    aval.nworkers<-BPPARAM$workers
    block.size<-max(ceiling(G/aval.nworkers), 5)
    Yall.list <- split(as.data.frame(Yall), ceiling(seq_len(G)/block.size))

#    if(.Platform$OS.type == "unix") {
    ## do some parallel computation under Unix
#        multicoreParam <- MulticoreParam(workers = ncores)
      res <- bplapply(X=Yall.list, islet.solve.block, datuse=input, BPPARAM=BPPARAM)
#  }
      return(res)
}

(2) Deconvolve individual-specific reference panel

We deconvolve the individual-specific reference panel with isletSolve() and compare the deconvolved reference panel with true reference panel.

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
N25_input

execution_time <- system.time({
  N25_age.ref <- isletSolve(input=N25_input)
})

print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_0_5.rds")
N25_age.ref <- readRDS("N25_ref_0_5.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)

lambda_list_islet <- list()

for (i in 1:50) {
  if (i %in% seq(1,25)) {
    Bcell = caseVal[["B-cells"]][,i]
    CD4 = caseVal[["CD4"]][,i]
    CD8 = caseVal[["CD8"]][,i]
    NK = caseVal[["NK"]][,i]
    Neutrophils = caseVal[["Neutrophils"]][,i]
    Monocytes = caseVal[["Monocytes"]][,i]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
    
  }
  
  else {
    Bcell = ctrlVal[["B-cells"]][,i-25]
    CD4 = ctrlVal[["CD4"]][,i-25]
    CD8 = ctrlVal[["CD8"]][,i-25]
    NK = ctrlVal[["NK"]][,i-25]
    Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
    Monocytes = ctrlVal[["Monocytes"]][,i-25]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
  }
}

true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Case)")  # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)")  # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.229914795850654"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), # +1 to avoid -Inf
     main="ISLET estimated reference panel vs the true reference panel (Case)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19, 
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.253906857043226"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Ctrl)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19,  
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

(3) Test cell-type-specific differential expression (csDE) genes

i. Intercept test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1    2     3   4     5    6
## FGR   1734 7102 13741 929 16142 6666
## CFH      7  198    24 100    13   11
## FUCA2   91   41  4895 169  3025  181
##        126   127  128  129  130  131
## FGR   1169 10682 8723 6347 9193 5170
## CFH      2    71    6  107  642    6
## FUCA2   67   136  609  166  354  299
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "intercept"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_0_5.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_0.5 <- readRDS("N25_intercept_test_0_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_0.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.5)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
  Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_0.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_0.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.052
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.048
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.104
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.072
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.06
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.204
accuracy_list_0.5_intercept <- accuracy_list_0.5

We use ROC curve to illustrate the result.

K = length(cellType_list)
G = length(gene_list)
flag = matrix(0, nrow = G, ncol = K)
rownames(flag) = gene_list
colnames(flag) = cellType_list
for (k in 1:K) {
    type3_indices = DEG_list[[cellType_list[k]]]$Type3
    flag[type3_indices, k] = 1
  }
ROC.DE <- function(DE.gs, pval) {
    pred = prediction(1-pval, DE.gs)
    perf = performance(pred,"tpr","fpr")
    perf
}

roc = ROC.DE(flag, N25_age.test_0.5)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 0.5)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

ii. Slope test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          age = meta_data$time_point, 
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1    2     3   4     5    6
## FGR   1734 7102 13741 929 16142 6666
## CFH      7  198    24 100    13   11
## FUCA2   91   41  4895 169  3025  181
##        126   127  128  129  130  131
## FGR   1169 10682 8723 6347 9193 5170
## CFH      2    71    6  107  642    6
## FUCA2   67   136  609  166  354  299
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "slope"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_0_5.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_0.5 <- readRDS("N25_slope_test_0_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_0.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_0.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_0.5)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
# Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
  Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_0.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_0.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.08
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.084
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.076
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.116
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.12
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.136
accuracy_list_0.5_slope <- accuracy_list_0.5

We use ROC curve to illustrate the result.

roc = ROC.DE(flag, N25_age.test_0.5)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 0.5)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

7. \(\Delta_0\) = 1.0

ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
                       gene_ls = gene_list, cell_ls = cellType_list, LFC=1.0,  
              k_mu = 0.01, k_delta = 0.1)

meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls

head(meta_data)
##   sample_ID group subject_ID time_point
## 1         1  case          1          1
## 2         2  case          1          2
## 3         3  case          1          3
## 4         4  case          1          4
## 5         5  case          1          5
## 6         6  case          2          1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)

theta_df = as.data.frame(t(theta))

(1) Heatmap of \(M\) and \(\lambda\)

i. Mean expression \(M\)

heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
  M <- M_list[[i]]
  if (i %in% 1:5) {
  create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) } 
  else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})

for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 第i个热图
    heatmap_grobs[[i + 5]],    # 第i+5个热图
    nrow = 1                   # 一行两列
  )
}

We examine whether DE genes we set are differentially expressed in \(M\) by looking at the corresponding index in case and control.

heatmap_grobs <- lapply(c(1:5), function(i) {
  M_case <- M_list[[i]]
  M_ctrl <- M_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
 M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
    M_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
    M_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
 M_combined <- cbind(M_sub_case, M_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

ii. Ture reference panel \(\lambda\)

heatmap_grobs <- lapply(c(1:5), function(i) {
  lambda_case <- lambda_list[[i]]
  lambda_ctrl <- lambda_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
  lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
    lambda_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
    lambda_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
  lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

(2) Deconvolve individual-specific reference panel

We deconvolve the individual-specific reference panel with isletSolve() and compare the deconvolved reference panel with true reference panel.

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
N25_input

execution_time <- system.time({
  N25_age.ref <- isletSolve(input=N25_input)
})

print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_1_0.rds")
N25_age.ref <- readRDS("N25_ref_1_0.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)

lambda_list_islet <- list()

for (i in 1:50) {
  if (i %in% seq(1,25)) {
    Bcell = caseVal[["B-cells"]][,i]
    CD4 = caseVal[["CD4"]][,i]
    CD8 = caseVal[["CD8"]][,i]
    NK = caseVal[["NK"]][,i]
    Neutrophils = caseVal[["Neutrophils"]][,i]
    Monocytes = caseVal[["Monocytes"]][,i]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
    
  }
  
  else {
    Bcell = ctrlVal[["B-cells"]][,i-25]
    CD4 = ctrlVal[["CD4"]][,i-25]
    CD8 = ctrlVal[["CD8"]][,i-25]
    NK = ctrlVal[["NK"]][,i-25]
    Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
    Monocytes = ctrlVal[["Monocytes"]][,i-25]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
  }
}

true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Case)")  # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)")  # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.251721318589678"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Case)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19,  
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.277678122962485"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Ctrl)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19, 
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

(3) Test cell-type-specific differential expression (csDE) genes

i. Intercept test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1     2     3    4     5    6
## FGR   1885 10212 15038 1384 16607 6720
## CFH     10   315    44  191    25   22
## FUCA2  176    57  7455  282  3578  316
##        126   127   128  129   130  131
## FGR   1835 12316 10007 6748 10296 6258
## CFH      6   137    14  138  1011   13
## FUCA2  101   222  1086  309   499  474
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "intercept"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_1_0.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_1.0 <- readRDS("N25_intercept_test_1_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_1.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.0)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
 #Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
 Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_1.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_1.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.056
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.048
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.108
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.064
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.056
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.228
accuracy_list_1.0_intercept <- accuracy_list_1.0
roc = ROC.DE(flag, N25_age.test_1.0)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 1.0)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

ii. Slope test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1     2     3    4     5    6
## FGR   1885 10212 15038 1384 16607 6720
## CFH     10   315    44  191    25   22
## FUCA2  176    57  7455  282  3578  316
##        126   127   128  129   130  131
## FGR   1835 12316 10007 6748 10296 6258
## CFH      6   137    14  138  1011   13
## FUCA2  101   222  1086  309   499  474
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "slope"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_1_0.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_1.0 <- readRDS("N25_slope_test_1_0.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.0))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_1.0 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.0)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
 #Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
 Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_1.0[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_1.0)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.084
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.072
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.092
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.108
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.084
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.14
accuracy_list_1.0_slope <- accuracy_list_1.0
roc = ROC.DE(flag, N25_age.test_1.0)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 1.0)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

8. \(\Delta_0\) = 1.5

ref = Gen_ref(J=50, T=5, mu_m = mu_m, sigma_m = sigma_m,
                       gene_ls = gene_list, cell_ls = cellType_list, LFC=1.5,  
              k_mu = 0.01, k_delta = 0.1)

meta_data <- ref$meta_data
lambda_list <- ref$lambda_ls
M_list <- ref$M_ls
Phi_list <- ref$Phi_ls
DEG_list <- ref$DEG_ls

head(meta_data)
##   sample_ID group subject_ID time_point
## 1         1  case          1          1
## 2         2  case          1          2
## 3         3  case          1          3
## 4         4  case          1          4
## 5         5  case          1          5
## 6         6  case          2          1
Y <- Gen_mix(lambda_ls = lambda_list, theta = theta)

theta_df = as.data.frame(t(theta))

(1) Heatmap of \(M\) and \(\lambda\)

i. Mean expression \(M\)

heatmap_grobs <- lapply(c(1:5, 126:130), function(i) {
  M <- M_list[[i]]
  if (i %in% 1:5) {
  create_heatmap_grob(M, paste("subject 1 (case) at time point ", i), color_palette) } 
  else {create_heatmap_grob(M, paste("subject 26 (ctrl) at time point ", i-125), color_palette)}
})

for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 第i个热图
    heatmap_grobs[[i + 5]],    # 第i+5个热图
    nrow = 1                   # 一行两列
  )
}

We examine whether DE genes we set are differentially expressed in \(M\) by looking at the corresponding index in case and control.

heatmap_grobs <- lapply(c(1:5), function(i) {
  M_case <- M_list[[i]]
  M_ctrl <- M_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
 M_sub_case <- do.call(cbind, lapply(1:ncol(M_case), function(j) {
    M_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  M_sub_ctrl <- do.call(cbind, lapply(1:ncol(M_ctrl), function(j) {
    M_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
 M_combined <- cbind(M_sub_case, M_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(M_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

ii. Ture reference panel \(\lambda\)

heatmap_grobs <- lapply(c(1:5), function(i) {
  lambda_case <- lambda_list[[i]]
  lambda_ctrl <- lambda_list[[i + 125]]
  
  # 创建子集矩阵,只保留每列的250个元素
  lambda_sub_case <- do.call(cbind, lapply(1:ncol(lambda_case), function(j) {
    lambda_case[((j-1)*250 + 1):(j*250), j]
  }))
  
  lambda_sub_ctrl <- do.call(cbind, lapply(1:ncol(lambda_ctrl), function(j) {
    lambda_ctrl[((j-1)*250 + 1):(j*250), j]
  }))
  
  # 将 case 和 ctrl 的 lambda_sub 进行 cbind 操作
  lambda_combined <- cbind(lambda_sub_case, lambda_sub_ctrl)
  
  # 创建热图 grob
  create_heatmap_grob(lambda_combined, paste("subject 1 (case) and subject 26 (ctrl) at time point ", i), color_palette)
})

# 绘制热图
for (i in 1:5) {
  grid.arrange(
    heatmap_grobs[[i]],        # 每个时间点的热图
    nrow = 1                   # 一行一列
  )
}

(2) Deconvolve individual-specific reference panel

We deconvolve the individual-specific reference panel with isletSolve() and compare the deconvolved reference panel with true reference panel.

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
N25_input

execution_time <- system.time({
  N25_age.ref <- isletSolve(input=N25_input)
})

print(execution_time)
saveRDS(N25_age.ref, file="N25_ref_1_5.rds")
N25_age.ref <- readRDS("N25_ref_1_5.rds")
caseVal <- caseEst(N25_age.ref)
ctrlVal <- ctrlEst(N25_age.ref)

lambda_list_islet <- list()

for (i in 1:50) {
  if (i %in% seq(1,25)) {
    Bcell = caseVal[["B-cells"]][,i]
    CD4 = caseVal[["CD4"]][,i]
    CD8 = caseVal[["CD8"]][,i]
    NK = caseVal[["NK"]][,i]
    Neutrophils = caseVal[["Neutrophils"]][,i]
    Monocytes = caseVal[["Monocytes"]][,i]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
    
  }
  
  else {
    Bcell = ctrlVal[["B-cells"]][,i-25]
    CD4 = ctrlVal[["CD4"]][,i-25]
    CD8 = ctrlVal[["CD8"]][,i-25]
    NK = ctrlVal[["NK"]][,i-25]
    Neutrophils = ctrlVal[["Neutrophils"]][,i-25]
    Monocytes = ctrlVal[["Monocytes"]][,i-25]
    lambda = t(rbind(Bcell, CD4, CD8, NK, Neutrophils, Monocytes))
    colnames(lambda) = c("B-cells","CD4", "CD8",  "NK", "Neutrophils", "Monocytes")
    
    lambda_list_islet[[as.character(i)]] <- lambda
  }
}

true_ref_case <- lambda_list_islet[[1]][1:50, ]
islet_ref_case <- lambda_list[[1]][1:50, ]
ref_case_combined <- cbind(true_ref_case, islet_ref_case)
pheatmap(ref_case_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Case)")  # 显示列名

true_ref_ctrl <- lambda_list[[126]][1:50, ]
islet_ref_ctrl <- lambda_list_islet[[26]][1:50, ]
ref_ctrl_combined <- cbind(true_ref_ctrl, islet_ref_ctrl)
pheatmap(ref_ctrl_combined, 
         cluster_rows = FALSE,  # 关闭行聚类
         cluster_cols = FALSE,  # 关闭列聚类
         show_rownames = FALSE,  # 不显示行名
         show_colnames = TRUE,
         main = "True Reference Panel vs Deconvolved Reference Panel (Ctrl)")  # 显示列名

true_ref_case <- lambda_list[[1]]
islet_ref_case <- lambda_list_islet[[1]]
overall_correlation <- cor(c(true_ref_case), c(islet_ref_case))
print(paste("Correlation of case:", overall_correlation))
## [1] "Correlation of case: 0.281991831863852"
plot(c(log(true_ref_case+1)), c(log(islet_ref_case+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Case)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19,  
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

true_ref_ctrl <- lambda_list[[126]]
islet_ref_ctrl <- lambda_list_islet[[26]]
overall_correlation <- cor(c(true_ref_ctrl), c(islet_ref_ctrl))
print(paste("Correlation of ctrl:", overall_correlation))
## [1] "Correlation of ctrl: 0.308103086042903"
plot(c(log(true_ref_ctrl+1)), c(log(islet_ref_ctrl+1)), 
     main="ISLET estimated reference panel vs the true reference panel (Ctrl)", 
     xlab="Log(True Reference)",  
     ylab="Log(Estimated Reference)", 
     pch=19, 
     col="blue")

abline(a=0, b=1, col="red", lwd=2) 

(3) Test cell-type-specific differential expression (csDE) genes

i. Intercept test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          #age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrep(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1     2     3    4     5    6
## FGR   2133 15317 17342 1744 16880 7120
## CFH     18   519    58  332    37   47
## FUCA2  276    94  7133  501  4472  455
##        126   127   128  129   130  131
## FGR   3027 15125 12162 7453 12052 6922
## CFH      9   203    25  276  1764   22
## FUCA2  160   374  1745  522   786  756
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "intercept"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_intercept_test_1_5.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_1.5 <- readRDS("N25_intercept_test_1_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_1.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.5)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
 #Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
 Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_1.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_1.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.06
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.052
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.12
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.08
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.064
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.264
accuracy_list_1.5_intercept <- accuracy_list_1.5
roc = ROC.DE(flag, N25_age.test_1.5)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 1.5)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

ii. Slope test

sample_info <- data.frame(group = meta_data$group,
                          subject_ID = meta_data$subject_ID,
                          age = meta_data$time_point,
                          'B-cells' = theta_df$`B-cells`,
                          'CD4' = theta_df$CD4,
                          'CD8' = theta_df$CD8,
                          'NK' = theta_df$NK,
                          'Neutrophils' = theta_df$Neutrophils,
                          'Monocytes' = theta_df$Monocytes)
colnames(sample_info)[which(colnames(sample_info) == "B.cells")] <- "B-cells"

N25_se <- SummarizedExperiment(
    assays = list(counts = as.data.frame(Y)),
    colData = sample_info)

N25_input <- dataPrepSlope(dat_se = N25_se)
## Begin: working on data preparation as the input for ISLET algorithm.
## Complete: data preparation for ISLET.
N25_input
## First couple of elements from cases and controls: 
##          1     2     3    4     5    6
## FGR   2133 15317 17342 1744 16880 7120
## CFH     18   519    58  332    37   47
## FUCA2  276    94  7133  501  4472  455
##        126   127   128  129   130  131
## FGR   3027 15125 12162 7453 12052 6922
## CFH      9   203    25  276  1764   22
## FUCA2  160   374  1745  522   786  756
## Design matrices hidded. 
## Total cell type number: 
## [1] 6
## Cell type categories: 
## [1] "B-cells"     "CD4"         "CD8"         "NK"          "Neutrophils"
## [6] "Monocytes"  
## Total sample number and subject number: 
## [1] 250  50
## Total case number and ctrl number: 
## [1] 25 25
## First several subject ID for the samples: 
##  [1] 1 1 1 1 1 2 2 2 2 2
## Data preparation type (intercept/slope): 
## [1] "slope"
execution_time <- system.time({
  N25_age.test <- isletTest(input = N25_input)
})

print(execution_time)
rownames(N25_age.test) <- gene_list
saveRDS(N25_age.test, file="N25_slope_test_1_5.rds")

We used BH procedure to control FDR in multiple testing, and then reported csDEG at pValue < 0.05.

N25_age.test_1.5 <- readRDS("N25_slope_test_1_5.rds")
p_values_long <- as.data.frame(as.table(N25_age.test_1.5))
colnames(p_values_long) <- c("Gene", "CellType", "PValue")
p_values_long$FDR <- p.adjust(p_values_long$PValue, method = "BH")

csDEG <- p_values_long %>%
  filter(PValue < 0.05)

top_csDEG <- csDEG %>%
  group_by(CellType) %>%
  arrange(CellType, PValue) %>%
  ungroup()

gene_list_by_celltype <- top_csDEG %>%
  split(.$CellType) %>%
  lapply(function(df) df$Gene)

islet_gene_list_by_celltype  <- list()

for (cell_type in names(gene_list_by_celltype)) {
  factor_vector <- gene_list_by_celltype[[cell_type]]
  
  gene_names <- as.character(factor_vector)
  
  islet_gene_list_by_celltype[[cell_type]] <- gene_names
}


true_gene_list_by_celltype <- list()

for (cell_type in names(DEG_list)) {
  #type1_indices <- DEG_list[[cell_type]]$Type1
  #type3_indices <- DEG_list[[cell_type]]$Type3
  
  all_indices <- DEG_list[[cell_type]]$Type3
  
  genes <- gene_list[all_indices]
  
  true_gene_list_by_celltype[[cell_type]] <- genes
}

num_DEG = length(DEG_list[[cell_type]]$Type3) # the number of DEG of each type

accuracy_list_1.5 <- setNames(vector("list", length(cellType_list)), cellType_list)
for (cell_type in names(accuracy_list_1.5)) {
  predicted_genes <- islet_gene_list_by_celltype[[cell_type]]
  #type1_indices <- true_gene_list_by_celltype[[cell_type]]
  type3_indices <- true_gene_list_by_celltype[[cell_type]]
  
 #Type1_accuracy <- sum(as.numeric(type1_indices %in% predicted_genes))/num_DEG
 Type3_accuracy <- sum(as.numeric(type3_indices %in% predicted_genes))/num_DEG
  
  accuracy_list_1.5[[cell_type]] <- list(#Type1_accuracy = Type1_accuracy)
                                     Type3_accuracy = Type3_accuracy)
  }


print(accuracy_list_1.5)
## $`B-cells`
## $`B-cells`$Type3_accuracy
## [1] 0.084
## 
## 
## $CD4
## $CD4$Type3_accuracy
## [1] 0.072
## 
## 
## $CD8
## $CD8$Type3_accuracy
## [1] 0.092
## 
## 
## $NK
## $NK$Type3_accuracy
## [1] 0.1
## 
## 
## $Neutrophils
## $Neutrophils$Type3_accuracy
## [1] 0.06
## 
## 
## $Monocytes
## $Monocytes$Type3_accuracy
## [1] 0.136
accuracy_list_1.5_slope <- accuracy_list_1.5
roc = ROC.DE(flag, N25_age.test_1.5)

# 假设 roc 是 ROC.DE 函数生成的对象
tpr_list <- slot(roc, "y.values")
fpr_list <- slot(roc, "x.values")

# 初始化空数据框
roc_data <- data.frame()

# 循环处理每个cellType
for (k in 1:length(cellType_list)) {
  # 提取当前cellType的fpr和tpr
  fpr <- fpr_list[[k]]
  tpr <- tpr_list[[k]]
  
  # 生成临时数据框并添加到总数据框中
  temp_data <- data.frame(
    FPR = fpr,
    TPR = tpr,
    CellType = cellType_list[k]
  )
  
  roc_data <- rbind(roc_data, temp_data)
}

# 使用ggplot2绘制ROC曲线
ggplot(roc_data, aes(x = FPR, y = TPR, color = CellType)) +
  geom_line(size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(title = "ROC Curve for DE Gene Detection (LFC = 1.5)",
       x = "False Positive Rate (1-Specificity)",
       y = "True Positive Rate (Sensitivity)") +
  theme_minimal() +
  theme(legend.title = element_text(size = 10),
        legend.text = element_text(size = 8),
        plot.title = element_text(hjust = 0.5, size = 14)) +
  scale_color_brewer(palette = "Set1")

9. Accuracy at different LFC

(1) Intercept test

df_0.5 <- data.frame(CellType = names(accuracy_list_0.5_intercept),
                     Accuracy = sapply(accuracy_list_0.5_intercept, function(x) x$Type3_accuracy),
                     LFC = 0.5)

df_1.0 <- data.frame(CellType = names(accuracy_list_1.0_intercept),
                     Accuracy = sapply(accuracy_list_1.0_intercept, function(x) x$Type3_accuracy),
                     LFC = 1.0)

df_1.5 <- data.frame(CellType = names(accuracy_list_1.5_intercept),
                     Accuracy = sapply(accuracy_list_1.5_intercept, function(x) x$Type3_accuracy),
                     LFC = 1.5)

# 合并所有的数据框
df_all <- rbind(df_0.5, df_1.0, df_1.5)

# 绘图
p <- ggplot(df_all, aes(x = LFC, y = Accuracy, color = CellType)) +
  geom_line() +
  geom_point() +
  labs(title = "Accuracy vs LFC for Different Cell Types (Intercept Test)",
       x = "LFC",
       y = "Accuracy") +
  theme_minimal()

print(p)

(2) Slope test

df_0.0 <- data.frame(CellType = names(accuracy_list_0.0_slope),
                     Accuracy = sapply(accuracy_list_0.0_slope, function(x) x$Type3_accuracy),
                     LFC = 0.0)

df_0.5 <- data.frame(CellType = names(accuracy_list_0.5_slope),
                     Accuracy = sapply(accuracy_list_0.5_slope, function(x) x$Type3_accuracy),
                     LFC = 0.5)

df_1.0 <- data.frame(CellType = names(accuracy_list_1.0_slope),
                     Accuracy = sapply(accuracy_list_1.0_slope, function(x) x$Type3_accuracy),
                     LFC = 1.0)

df_1.5 <- data.frame(CellType = names(accuracy_list_1.5_slope),
                     Accuracy = sapply(accuracy_list_1.5_slope, function(x) x$Type3_accuracy),
                     LFC = 1.5)

# 合并所有的数据框
df_all <- rbind(df_0.0, df_0.5, df_1.0, df_1.5)

# 绘图
p <- ggplot(df_all, aes(x = LFC, y = Accuracy, color = CellType)) +
  geom_line() +
  geom_point() +
  labs(title = "Accuracy vs LFC for Different Cell Types (Slope Test)",
       x = "LFC",
       y = "Accuracy") +
  theme_minimal()

print(p)